{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Factorization Machines with tf\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 根据user/items的id，建立稀疏矩阵\n",
    "参考：https://gist.github.com/babakx/7a3fc9739b7778f6673a458605e18963"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from itertools import count\n",
    "from collections import defaultdict\n",
    "from scipy.sparse import csr\n",
    "import numpy as np\n",
    "\n",
    "def vectorize_dic(dic,ix=None,p=None,n=0,g=0):\n",
    "    # dic -- dictionary of feature lists. Keys are the name of features\n",
    "    # ix -- index generator (default None)\n",
    "    # p -- dimension of feature space (number of columns in the sparse matrix) (default None)\n",
    "    # n -- num sample\n",
    "    # g -- num group: eg: uese/items---> g=2\n",
    "    \n",
    "    if ix==None:\n",
    "        ix = dict()\n",
    "    \n",
    "    \n",
    "    nz = n * g # number of non-zores\n",
    "\n",
    "    col_ix = np.empty(nz,dtype = int)\n",
    "\n",
    "    i = 0\n",
    "    for k,lis in dic.items():\n",
    "        for t in range(len(lis)):\n",
    "            ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k),0) + 1\n",
    "            # 附加索引'l'以防止将具有相同id的不同列映射到同一个索引\n",
    "            col_ix[i+t*g] = ix[str(lis[t]) + str(k)]\n",
    "        i += 1\n",
    "\n",
    "    row_ix = np.repeat(np.arange(0,n),g)\n",
    "    data = np.ones(nz)\n",
    "    if p == None:\n",
    "        p = len(ix)\n",
    "\n",
    "    ixx = np.where(col_ix < p)\n",
    "    return csr.csr_matrix((data[ixx],(row_ix[ixx],col_ix[ixx])),shape=(n,p))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loading data\n",
    "使用MovieLens100k的数据，将数据转化成稀疏矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.feature_extraction import DictVectorizer\n",
    "\n",
    "cols = ['user','item','rating','timestamp']\n",
    "train = pd.read_csv('data/ua.base',delimiter='\\t',names=cols)\n",
    "test = pd.read_csv('data/ua.test', delimiter='\\t', names=cols)\n",
    "\n",
    "x_train = vectorize_dic({'users':train['user'].values, 'items':train['item'].values},n=len(train.index),g=2)\n",
    "x_test= vectorize_dic({'users':test['user'].values,'items':test['item'].values},ix,x_train.shape[1],n=len(test.index),g=2)\n",
    "\n",
    "y_train = train.rating.values\n",
    "y_test = test.rating.values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Input To Dense\n",
    "把输入的x_train和x_test转化成dense格式，使其能被tf使用。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(90570, 2623) (9430, 2623)\n"
     ]
    }
   ],
   "source": [
    "x_train = x_train.todense()\n",
    "x_test = x_test.todense()\n",
    "\n",
    "print(x_train.shape, x_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 用tensorflow定义FM模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 初始化参数\n",
    "import tensorflow as tf\n",
    "\n",
    "n,p = x_train.shape\n",
    "# number 0f latent factor\n",
    "k = 10\n",
    "\n",
    "x = tf.placeholder('float',[None,p])\n",
    "y = tf.placeholder('float',[None,1])\n",
    "\n",
    "# bias and weight\n",
    "w0 = tf.Variable(tf.zeros([1]))\n",
    "w = tf.Variable(tf.zeros([p]))\n",
    "\n",
    "#interaction factors\n",
    "v = tf.Variable(tf.random_normal([k,p],mean=0,stddev=0.01))\n",
    "\n",
    "y_hat = tf.Variable(tf.zeros([n, 1]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 定义输出y的计算公式\n",
    "$$ \\hat{y}(\\mathbf{x}) = w_0 + \\sum_{j=1}^{p}w_jx_j + \\frac{1}{2} \\sum_{f=1}^{k} ((\\sum_{j=1}^{p}v_{j,f}x_j)^2-\\sum_{j=1}^{p}v_{j,f}^2 x_j^2)$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From <ipython-input-19-174f8e1533a4>:2: calling reduce_sum (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "keep_dims is deprecated, use keepdims instead\n"
     ]
    }
   ],
   "source": [
    "# 计算FM公式的输出\n",
    "linear_terms = tf.add(w0,tf.reduce_sum(tf.multiply(w,x),1,keep_dims=True))\n",
    "pair_interactions = 0.5 * tf.reduce_sum(\n",
    "                            tf.subtract(\n",
    "                                tf.pow(tf.matmul(x,tf.transpose(v)),2),\n",
    "                                        tf.matmul(tf.pow(x,2),tf.transpose(tf.pow(v,2)))),axis=1, keep_dims=True)\n",
    "y_hat = tf.add(linear_terms, pair_interactions)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loss function\n",
    "$$ L = \\sum_{i=1}^{n} (y_i - \\hat{y}_i)^2 + \\lambda_w ||W||^2 + \\lambda_v ||V||^2$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# L2 reg sum of squares of loss function\n",
    "lambda_w = tf.constant(0.001, name='lambda_w')\n",
    "lambda_v = tf.constant(0.001, name='lambda_v')\n",
    "\n",
    "l2_norm = tf.reduce_sum(\n",
    "                    tf.add(\n",
    "                        tf.multiply(lambda_w, tf.pow(w,2)),\n",
    "                        tf.multiply(lambda_v, tf.pow(v,2))))\n",
    "error = tf.reduce_mean(tf.square(tf.subtract(y,y_hat)))\n",
    "loss = tf.add(error,l2_norm)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Optimization\n",
    "用SGD进行优化: $\\Theta_{i+1} = \\Theta_{i} - \\eta \\frac{\\delta L}{\\delta \\Theta}$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Mini-batcher"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "def batcher(X_,y_=None,batch_size=-1):\n",
    "    n_samples = X_.shape[0]\n",
    "    if batch_size == -1:\n",
    "        batch_size = n_samples\n",
    "    if batch_size < 1:\n",
    "        raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))\n",
    "        \n",
    "    for i in range(0,n_samples,batch_size):\n",
    "        upper_bound = min(i + batch_size,n_samples)\n",
    "        ret_x = X_[i:upper_bound]\n",
    "        ret_y = None\n",
    "        if y_ is not None:\n",
    "            ret_y = y_[i:i + batch_size]\n",
    "            yield (ret_x,ret_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tensorflow graph and traing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6ccb183e948f4aa88f20186a0a31ffe8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=10), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm_notebook as tqdm\n",
    "\n",
    "epochs = 10\n",
    "batch_size = 1000\n",
    "\n",
    "# tf graph\n",
    "init = tf.global_variables_initializer()\n",
    "sess = tf.Session()\n",
    "\n",
    "sess.run(init)\n",
    "\n",
    "for epochs in tqdm(range(epochs),unit='epoch'):\n",
    "    perm = np.random.permutation(x_train.shape[0])\n",
    "    # iterate over batches\n",
    "    for bX,bY in batcher(x_train[perm],y_train[perm],batch_size):\n",
    "        sess.run(optimizer, feed_dict={x: bX.reshape(-1,p), y: bY.reshape(-1,1)})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 评价模型\n",
    "用RMSE评价"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1.1257708]\n"
     ]
    }
   ],
   "source": [
    "errors = []\n",
    "for bX,bY in batcher(x_test,y_test):\n",
    "    errors.append(sess.run(error,feed_dict={x: bX.reshape(-1,p), y: bY.reshape(-1,1)}))\n",
    "\n",
    "RMSE = np.sqrt(np.array(errors))\n",
    "print(RMSE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
